from typing import List
from pydantic import BaseModel, Field



from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import  GPTVectorStoreIndex,VectorStoreIndex
from llama_index.llms import openai_like
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # HuggingFaceEmbedding:用于将文本转换为词向量
from llama_index.llms.huggingface import HuggingFaceLLM  # HuggingFaceLLM：用于运行Hugging Face的预训练语言模型
from llama_index.core import Settings,SimpleDirectoryReader,VectorStoreIndex
import chromadb
from llama_index.embeddings.dashscope import DashScopeEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.llms.deepseek  import DeepSeek
from llama_index.embeddings.fastembed import FastEmbedEmbedding

from llama_index.core import QueryBundle

# import NodeWithScore
from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
    # 连接Chroma数据库


llm = DeepSeek(model="deepseek-chat", api_key="sk-605e60a1301040759a821b6b677556fb")
Settings.llm = llm
 
from zhipuai import ZhipuAI
from llama_index.embeddings.zhipuai import ZhipuAIEmbedding

embeddings = ZhipuAIEmbedding(
    model="embedding-2",
    api_key="f387f5e4837d4e4bba6d267682a957c9.PmPiTw8qVlsI2Oi5"
    # With the `embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)
Settings.embed_model=embeddings


from llama_index.core import Document

from llama_index.core.extractors import DocumentContextExtractor
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.storage.docstore.simple_docstore import (
    SimpleDocumentStore,
)
 

from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(
    input_files=["./paul_graham_essay_ambiguated.txt"]
)
documents = reader.load_data()

# Initialize document store and embedding model
docstore = SimpleDocumentStore()
 

# Create storage contexts
storage_context = StorageContext.from_defaults(docstore=docstore)
storage_context_no_extra_context = StorageContext.from_defaults()
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=256, chunk_overlap=10
)




context_extractor = DocumentContextExtractor(
    # these 2 are mandatory
    docstore=docstore,
    max_context_length=200,
    # below are optional
    llm=llm,  # default to Settings.llm
    oversized_document_strategy="warn",
    max_output_tokens=20,
    key="context",
    prompt=DocumentContextExtractor.SUCCINCT_CONTEXT_PROMPT,
)

print (documents)
output=context_extractor.extract(documents)
print(output)
'''

# need to add documents directly for the DocumentContextExtractor to work
storage_context.docstore.add_documents(documents)
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    embed_model=embeddings,
    transformations=[text_splitter, context_extractor],
)

index_nocontext = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context_no_extra_context,
    embed_model=embeddings,
    transformations=[text_splitter],
)

test_question = "Which chunks of text discuss the IBM 704?"
retriever = index.as_retriever(similarity_top_k=2)
nodes_fromcontext = retriever.retrieve(test_question)

retriever_nocontext = index_nocontext.as_retriever(similarity_top_k=2)
nodes_nocontext = retriever_nocontext.retrieve(test_question)

# Print each node's content
print("==========")
print("NO CONTEXT")
for i, node in enumerate(nodes_nocontext, 1):
    print(f"\nChunk {i}:")
    print(f"Score: {node.score}")  # Similarity score
    print(f"Content: {node.node.text}")  # The actual text content

# Print each node's content
print("==========")
print("WITH CONTEXT")
for i, node in enumerate(nodes_fromcontext, 1):
    print(f"\nChunk {i}:")
    print(f"Score: {node.score}")  # Similarity score mDocumentContextExtractor
    print(f"Content: {node.node.text}")  # The actual text content
'''